In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import radians, sin, cos, sqrt, atan2
import warnings
import folium
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
In [2]:
# TODO(review): hardcoded absolute local path — make configurable for portability.
DATA_PATH = r"C:\Users\TransOrg\Downloads\finalTrain.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[2]:
| ID | Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Order_Date | Time_Orderd | Time_Order_picked | Weather_conditions | Road_traffic_density | Vehicle_condition | Type_of_order | Type_of_vehicle | multiple_deliveries | Festival | City | Time_taken (min) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0xcdcd | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 12-02-2022 | 21:55 | 22:10 | Fog | Jam | 2 | Snack | motorcycle | 3.0 | No | Metropolitian | 46 |
| 1 | 0xd987 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 13-02-2022 | 14:55 | 15:05 | Stormy | High | 1 | Meal | motorcycle | 1.0 | No | Metropolitian | 23 |
| 2 | 0x2784 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 04-03-2022 | 17:30 | 17:40 | Sandstorms | Medium | 1 | Drinks | scooter | 1.0 | No | Metropolitian | 21 |
| 3 | 0xc8b6 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 13-02-2022 | 09:20 | 09:30 | Sandstorms | Low | 0 | Buffet | motorcycle | 0.0 | No | Metropolitian | 20 |
| 4 | 0xdb64 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 14-02-2022 | 19:50 | 20:05 | Fog | Jam | 1 | Snack | scooter | 1.0 | No | Metropolitian | 41 |
In [3]:
# Raw dataset size: 45,584 deliveries x 20 columns.
df.shape
Out[3]:
(45584, 20)
In [4]:
# Numeric summary. Data-quality flags visible here: negative restaurant
# coordinates (min lat -30.9, min lon -88.4) and a max rating of 6.0.
df.describe()
Out[4]:
| Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Vehicle_condition | multiple_deliveries | Time_taken (min) | |
|---|---|---|---|---|---|---|---|---|---|
| count | 43730.000000 | 43676.000000 | 45584.000000 | 45584.000000 | 45584.000000 | 45584.000000 | 45584.000000 | 44591.000000 | 45584.000000 |
| mean | 29.566911 | 4.633774 | 17.017948 | 70.229684 | 17.465480 | 70.844161 | 1.023385 | 0.744635 | 26.293963 |
| std | 5.815064 | 0.334744 | 8.185674 | 22.885575 | 7.335562 | 21.120578 | 0.839055 | 0.572510 | 9.384298 |
| min | 15.000000 | 1.000000 | -30.905562 | -88.366217 | 0.010000 | 0.010000 | 0.000000 | 0.000000 | 10.000000 |
| 25% | 25.000000 | 4.500000 | 12.933284 | 73.170000 | 12.988453 | 73.280000 | 0.000000 | 0.000000 | 19.000000 |
| 50% | 30.000000 | 4.700000 | 18.551440 | 75.897963 | 18.633934 | 76.002574 | 1.000000 | 1.000000 | 26.000000 |
| 75% | 35.000000 | 4.900000 | 22.728163 | 78.044095 | 22.785049 | 78.107044 | 2.000000 | 1.000000 | 32.000000 |
| max | 50.000000 | 6.000000 | 30.914057 | 88.433452 | 31.054057 | 88.563452 | 3.000000 | 3.000000 | 54.000000 |
In [5]:
# Dtypes and non-null counts; several columns carry missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45584 entries, 0 to 45583 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 45584 non-null object 1 Delivery_person_ID 45584 non-null object 2 Delivery_person_Age 43730 non-null float64 3 Delivery_person_Ratings 43676 non-null float64 4 Restaurant_latitude 45584 non-null float64 5 Restaurant_longitude 45584 non-null float64 6 Delivery_location_latitude 45584 non-null float64 7 Delivery_location_longitude 45584 non-null float64 8 Order_Date 45584 non-null object 9 Time_Orderd 43853 non-null object 10 Time_Order_picked 45584 non-null object 11 Weather_conditions 44968 non-null object 12 Road_traffic_density 44983 non-null object 13 Vehicle_condition 45584 non-null int64 14 Type_of_order 45584 non-null object 15 Type_of_vehicle 45584 non-null object 16 multiple_deliveries 44591 non-null float64 17 Festival 45356 non-null object 18 City 44384 non-null object 19 Time_taken (min) 45584 non-null int64 dtypes: float64(7), int64(2), object(11) memory usage: 7.0+ MB
In [6]:
# Missing-value count per column.
df.isnull().sum()
Out[6]:
ID 0 Delivery_person_ID 0 Delivery_person_Age 1854 Delivery_person_Ratings 1908 Restaurant_latitude 0 Restaurant_longitude 0 Delivery_location_latitude 0 Delivery_location_longitude 0 Order_Date 0 Time_Orderd 1731 Time_Order_picked 0 Weather_conditions 616 Road_traffic_density 601 Vehicle_condition 0 Type_of_order 0 Type_of_vehicle 0 multiple_deliveries 993 Festival 228 City 1200 Time_taken (min) 0 dtype: int64
In [7]:
# Per-column cardinality report; for low-cardinality columns (< 8 unique
# values) also list the values themselves.
for col in df.columns:
    print(f" no. of unique value in {col} is : {df[col].nunique()}")
    print("#######################################################")
    if df[col].nunique() < 8:
        print(f" unique value are : {df[col].unique()}")
        print("***************************************************")
no. of unique value in ID is : 45584
#######################################################
no. of unique value in Delivery_person_ID is : 1320
#######################################################
no. of unique value in Delivery_person_Age is : 22
#######################################################
no. of unique value in Delivery_person_Ratings is : 28
#######################################################
no. of unique value in Restaurant_latitude is : 657
#######################################################
no. of unique value in Restaurant_longitude is : 518
#######################################################
no. of unique value in Delivery_location_latitude is : 4373
#######################################################
no. of unique value in Delivery_location_longitude is : 4373
#######################################################
no. of unique value in Order_Date is : 44
#######################################################
no. of unique value in Time_Orderd is : 176
#######################################################
no. of unique value in Time_Order_picked is : 193
#######################################################
no. of unique value in Weather_conditions is : 6
#######################################################
unique value are : ['Fog' 'Stormy' 'Sandstorms' 'Windy' 'Cloudy' 'Sunny' nan]
***************************************************
no. of unique value in Road_traffic_density is : 4
#######################################################
unique value are : ['Jam' 'High' 'Medium' 'Low' nan]
***************************************************
no. of unique value in Vehicle_condition is : 4
#######################################################
unique value are : [2 1 0 3]
***************************************************
no. of unique value in Type_of_order is : 4
#######################################################
unique value are : ['Snack' 'Meal' 'Drinks' 'Buffet']
***************************************************
no. of unique value in Type_of_vehicle is : 4
#######################################################
unique value are : ['motorcycle' 'scooter' 'electric_scooter' 'bicycle']
***************************************************
no. of unique value in multiple_deliveries is : 4
#######################################################
unique value are : [ 3. 1. 0. nan 2.]
***************************************************
no. of unique value in Festival is : 2
#######################################################
unique value are : ['No' 'Yes' nan]
***************************************************
no. of unique value in City is : 3
#######################################################
unique value are : ['Metropolitian' 'Urban' 'Semi-Urban' nan]
***************************************************
no. of unique value in Time_taken (min) is : 45
#######################################################
In [8]:
# Per-column profiling table: count, missing, cardinality, missing %, dtype.
# Built as a list of records (one dict per column) instead of six parallel
# lists, so a field can never get out of sync with its column.
rows = []
n_rows = df.shape[0]
for column in df.columns:
    n_missing = df[column].isnull().sum()
    rows.append({
        'feature': column,
        'count': len(df[column]),
        'missing': n_missing,
        'nunique': df[column].nunique(),
        'missing_percentage': (n_missing / n_rows) * 100,
        'dtypes': df[column].dtypes,
    })
dataframe = pd.DataFrame(rows)
# Display indexed by feature name (set_index returns a copy; `dataframe`
# itself keeps its default RangeIndex).
dataframe.set_index('feature')
Out[8]:
| count | missing | nunique | missing_percentage | dtypes | |
|---|---|---|---|---|---|
| feature | |||||
| ID | 45584 | 0 | 45584 | 0.000000 | object |
| Delivery_person_ID | 45584 | 0 | 1320 | 0.000000 | object |
| Delivery_person_Age | 45584 | 1854 | 22 | 4.067217 | float64 |
| Delivery_person_Ratings | 45584 | 1908 | 28 | 4.185679 | float64 |
| Restaurant_latitude | 45584 | 0 | 657 | 0.000000 | float64 |
| Restaurant_longitude | 45584 | 0 | 518 | 0.000000 | float64 |
| Delivery_location_latitude | 45584 | 0 | 4373 | 0.000000 | float64 |
| Delivery_location_longitude | 45584 | 0 | 4373 | 0.000000 | float64 |
| Order_Date | 45584 | 0 | 44 | 0.000000 | object |
| Time_Orderd | 45584 | 1731 | 176 | 3.797385 | object |
| Time_Order_picked | 45584 | 0 | 193 | 0.000000 | object |
| Weather_conditions | 45584 | 616 | 6 | 1.351351 | object |
| Road_traffic_density | 45584 | 601 | 4 | 1.318445 | object |
| Vehicle_condition | 45584 | 0 | 4 | 0.000000 | int64 |
| Type_of_order | 45584 | 0 | 4 | 0.000000 | object |
| Type_of_vehicle | 45584 | 0 | 4 | 0.000000 | object |
| multiple_deliveries | 45584 | 993 | 4 | 2.178396 | float64 |
| Festival | 45584 | 228 | 2 | 0.500176 | object |
| City | 45584 | 1200 | 3 | 2.632503 | object |
| Time_taken (min) | 45584 | 0 | 45 | 0.000000 | int64 |
In [9]:
# The order ID is unique per row and carries no signal — remove it.
df = df.drop(columns=['ID'])
In [10]:
# Frame after dropping ID: 45,584 rows x 19 columns.
df
Out[10]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Order_Date | Time_Orderd | Time_Order_picked | Weather_conditions | Road_traffic_density | Vehicle_condition | Type_of_order | Type_of_vehicle | multiple_deliveries | Festival | City | Time_taken (min) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 12-02-2022 | 21:55 | 22:10 | Fog | Jam | 2 | Snack | motorcycle | 3.0 | No | Metropolitian | 46 |
| 1 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 13-02-2022 | 14:55 | 15:05 | Stormy | High | 1 | Meal | motorcycle | 1.0 | No | Metropolitian | 23 |
| 2 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 04-03-2022 | 17:30 | 17:40 | Sandstorms | Medium | 1 | Drinks | scooter | 1.0 | No | Metropolitian | 21 |
| 3 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 13-02-2022 | 09:20 | 09:30 | Sandstorms | Low | 0 | Buffet | motorcycle | 0.0 | No | Metropolitian | 20 |
| 4 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 14-02-2022 | 19:50 | 20:05 | Fog | Jam | 1 | Snack | scooter | 1.0 | No | Metropolitian | 41 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45579 | JAPRES04DEL01 | 30.0 | 4.8 | 26.902328 | 75.794257 | 26.912328 | 75.804257 | 24-03-2022 | 11:35 | 11:45 | Windy | High | 1 | Meal | motorcycle | 0.0 | No | Metropolitian | 32 |
| 45580 | AGRRES16DEL01 | 21.0 | 4.6 | 0.000000 | 0.000000 | 0.070000 | 0.070000 | 16-02-2022 | 19:55 | 20:10 | Windy | Jam | 0 | Buffet | motorcycle | 1.0 | No | Metropolitian | 36 |
| 45581 | CHENRES08DEL03 | 30.0 | 4.9 | 13.022394 | 80.242439 | 13.052394 | 80.272439 | 11-03-2022 | 23:50 | 24:05:00 | Cloudy | Low | 1 | Drinks | scooter | 0.0 | No | Metropolitian | 16 |
| 45582 | COIMBRES11DEL01 | 20.0 | 4.7 | 11.001753 | 76.986241 | 11.041753 | 77.026241 | 07-03-2022 | 13:35 | 13:40 | Cloudy | High | 0 | Snack | motorcycle | 1.0 | No | Metropolitian | 26 |
| 45583 | RANCHIRES09DEL02 | 23.0 | 4.9 | 23.351058 | 85.325731 | 23.431058 | 85.405731 | 02-03-2022 | 17:10 | 17:15 | Fog | Medium | 2 | Snack | scooter | 1.0 | No | Metropolitian | 36 |
45584 rows × 19 columns
In [11]:
# Parse Order_Date (day-first strings such as '12-02-2022') into datetime64.
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y')
df.head()
Out[11]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Order_Date | Time_Orderd | Time_Order_picked | Weather_conditions | Road_traffic_density | Vehicle_condition | Type_of_order | Type_of_vehicle | multiple_deliveries | Festival | City | Time_taken (min) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 2022-02-12 | 21:55 | 22:10 | Fog | Jam | 2 | Snack | motorcycle | 3.0 | No | Metropolitian | 46 |
| 1 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 2022-02-13 | 14:55 | 15:05 | Stormy | High | 1 | Meal | motorcycle | 1.0 | No | Metropolitian | 23 |
| 2 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 2022-03-04 | 17:30 | 17:40 | Sandstorms | Medium | 1 | Drinks | scooter | 1.0 | No | Metropolitian | 21 |
| 3 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 2022-02-13 | 09:20 | 09:30 | Sandstorms | Low | 0 | Buffet | motorcycle | 0.0 | No | Metropolitian | 20 |
| 4 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 2022-02-14 | 19:50 | 20:05 | Fog | Jam | 1 | Snack | scooter | 1.0 | No | Metropolitian | 41 |
In [12]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45584 entries, 0 to 45583 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Delivery_person_ID 45584 non-null object 1 Delivery_person_Age 43730 non-null float64 2 Delivery_person_Ratings 43676 non-null float64 3 Restaurant_latitude 45584 non-null float64 4 Restaurant_longitude 45584 non-null float64 5 Delivery_location_latitude 45584 non-null float64 6 Delivery_location_longitude 45584 non-null float64 7 Order_Date 45584 non-null datetime64[ns] 8 Time_Orderd 43853 non-null object 9 Time_Order_picked 45584 non-null object 10 Weather_conditions 44968 non-null object 11 Road_traffic_density 44983 non-null object 12 Vehicle_condition 45584 non-null int64 13 Type_of_order 45584 non-null object 14 Type_of_vehicle 45584 non-null object 15 multiple_deliveries 44591 non-null float64 16 Festival 45356 non-null object 17 City 44384 non-null object 18 Time_taken (min) 45584 non-null int64 dtypes: datetime64[ns](1), float64(7), int64(2), object(9) memory usage: 6.6+ MB
In [13]:
# Decompose Order_Date into calendar components for modelling.
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['Order_Date'].dt, part)
In [14]:
# Inspect the frame after adding year/month/day.
df.head()
Out[14]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Order_Date | Time_Orderd | Time_Order_picked | ... | Vehicle_condition | Type_of_order | Type_of_vehicle | multiple_deliveries | Festival | City | Time_taken (min) | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 2022-02-12 | 21:55 | 22:10 | ... | 2 | Snack | motorcycle | 3.0 | No | Metropolitian | 46 | 2022 | 2 | 12 |
| 1 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 2022-02-13 | 14:55 | 15:05 | ... | 1 | Meal | motorcycle | 1.0 | No | Metropolitian | 23 | 2022 | 2 | 13 |
| 2 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 2022-03-04 | 17:30 | 17:40 | ... | 1 | Drinks | scooter | 1.0 | No | Metropolitian | 21 | 2022 | 3 | 4 |
| 3 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 2022-02-13 | 09:20 | 09:30 | ... | 0 | Buffet | motorcycle | 0.0 | No | Metropolitian | 20 | 2022 | 2 | 13 |
| 4 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 2022-02-14 | 19:50 | 20:05 | ... | 1 | Snack | scooter | 1.0 | No | Metropolitian | 41 | 2022 | 2 | 14 |
5 rows × 22 columns
In [15]:
# Raw date no longer needed — year/month/day were extracted above.
df = df.drop(columns=['Order_Date'])
In [16]:
# Drop the 1,731 rows with a missing order time (~3.8% of the data).
df.dropna(subset=['Time_Orderd'],inplace=True)
In [17]:
# Some times use '.' as the separator (e.g. '19.55'); normalise to ':'.
# regex=False is essential: under the old regex=True default the pattern '.'
# matches EVERY character and would turn each value into ':::::'.
df['Time_Orderd'] = df['Time_Orderd'].str.replace('.', ':', regex=False)
In [18]:
# Spot-check the cleaned order times; random_state pins the sample so the
# notebook output is reproducible on re-run.
df['Time_Orderd'].sample(10, random_state=42)
Out[18]:
10882 21:25 1257 18:15 32630 10:50 27965 17:55 11534 14:50 40590 23:30 43193 21:50 5624 10:30 15060 17:30 38883 14:30 Name: Time_Orderd, dtype: object
In [19]:
# Helper (a plain function, not a lambda, so it can be reused for
# Time_Order_picked below) that normalises raw time strings.
def extract_time(x):
    """Return the 'HH:MM' prefix of a ':'-separated time string.

    Falls back to '00:00' for malformed values (no ':' separator) and for
    non-string values such as NaN floats, which previously raised
    AttributeError.
    """
    try:
        parts = x.split(':')
        return parts[0] + ':' + parts[1][:2]
    except (IndexError, AttributeError):
        return '00:00'
# Normalise every order time to 'HH:MM'.
df['Time_Orderd'] = df['Time_Orderd'].apply(extract_time)
In [20]:
# NOTE(review): this conversion is a silent no-op — the values are 'HH:MM'
# strings, so parsing with format '%H:%M:%S' fails and errors='ignore'
# returns the input unchanged. The .str accessor still working in the next
# cell confirms the column stayed object dtype. Consider removing this line.
df['Time_Orderd'] = pd.to_datetime(df['Time_Orderd'], format='%H:%M:%S',errors='ignore')
In [21]:
# Hour component of Time_Orderd ('21:55' -> '21').
df['TimeOrder_Hour']=df['Time_Orderd'].str.split(':',expand=True)[0] # 12:00 -> 12, 00
# Series.replace is an exact whole-value match here: a bare '0' hour
# (midnight written without a leading zero) becomes '00'. Cosmetic only —
# both '0' and '00' cast to the same integer below.
df['TimeOrder_Hour']=df['TimeOrder_Hour'].replace('0','00') #12:00
# int32 so the hour can be used numerically downstream
df['TimeOrder_Hour']=df['TimeOrder_Hour'].astype('int32')
In [22]:
# Minute component of Time_Orderd ('21:55' -> 55), stored as int32.
order_minutes = df['Time_Orderd'].str.split(':', expand=True)[1]
df['TimeOrder_min'] = order_minutes.astype('int32')
In [23]:
# Inspect the frame after adding TimeOrder_Hour / TimeOrder_min.
df.head()
Out[23]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Time_Orderd | Time_Order_picked | Weather_conditions | ... | Type_of_vehicle | multiple_deliveries | Festival | City | Time_taken (min) | year | month | day | TimeOrder_Hour | TimeOrder_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 21:55 | 22:10 | Fog | ... | motorcycle | 3.0 | No | Metropolitian | 46 | 2022 | 2 | 12 | 21 | 55 |
| 1 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 14:55 | 15:05 | Stormy | ... | motorcycle | 1.0 | No | Metropolitian | 23 | 2022 | 2 | 13 | 14 | 55 |
| 2 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 17:30 | 17:40 | Sandstorms | ... | scooter | 1.0 | No | Metropolitian | 21 | 2022 | 3 | 4 | 17 | 30 |
| 3 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 09:20 | 09:30 | Sandstorms | ... | motorcycle | 0.0 | No | Metropolitian | 20 | 2022 | 2 | 13 | 9 | 20 |
| 4 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 19:50 | 20:05 | Fog | ... | scooter | 1.0 | No | Metropolitian | 41 | 2022 | 2 | 14 | 19 | 50 |
5 rows × 23 columns
In [24]:
# Normalise '.'-separated pick-up times to ':'. regex=False is essential:
# under the old regex=True default '.' would match every character.
df['Time_Order_picked'] = df['Time_Order_picked'].str.replace('.', ':', regex=False)
In [25]:
# Normalise pick-up times to 'HH:MM' (e.g. '24:05:00' -> '24:05') by reusing
# extract_time defined in In[19] — the original cell re-defined an identical
# function, silently shadowing the first one.
df['Time_Order_picked'] = df['Time_Order_picked'].apply(extract_time)
In [26]:
# NOTE(review): same silent no-op as In[20] — 'HH:MM' strings do not match
# format '%H:%M:%S', and errors='ignore' hands back the input unchanged
# (the .str accessor below still works, confirming object dtype).
df['Time_Order_picked'] = pd.to_datetime(df['Time_Order_picked'], format='%H:%M:%S',errors='ignore')
In [27]:
# Hour component of Time_Order_picked, stored as int32.
pickup_hours = df['Time_Order_picked'].str.split(':', expand=True)[0]
df['Time_Order_picked_Hour'] = pickup_hours.astype('int32')
In [28]:
# Minute component of Time_Order_picked, stored as int32.
pickup_minutes = df['Time_Order_picked'].str.split(':', expand=True)[1]
df['Time_Order_picked_min'] = pickup_minutes.astype('int32')
Extracting the city code from Delivery_person_ID
In [29]:
# Split Delivery_person_ID on "RES": the prefix is the delivery city code
# (e.g. 'DEHRES17DEL01' -> 'DEH').
df['Delivery_city']=df['Delivery_person_ID'].str.split('RES',expand=True)[0]
In [30]:
# Inspect the frame after adding the Delivery_city code.
df.head()
Out[30]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Time_Orderd | Time_Order_picked | Weather_conditions | ... | City | Time_taken (min) | year | month | day | TimeOrder_Hour | TimeOrder_min | Time_Order_picked_Hour | Time_Order_picked_min | Delivery_city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DEHRES17DEL01 | 36.0 | 4.2 | 30.327968 | 78.046106 | 30.397968 | 78.116106 | 21:55 | 22:10 | Fog | ... | Metropolitian | 46 | 2022 | 2 | 12 | 21 | 55 | 22 | 10 | DEH |
| 1 | KOCRES16DEL01 | 21.0 | 4.7 | 10.003064 | 76.307589 | 10.043064 | 76.347589 | 14:55 | 15:05 | Stormy | ... | Metropolitian | 23 | 2022 | 2 | 13 | 14 | 55 | 15 | 5 | KOC |
| 2 | PUNERES13DEL03 | 23.0 | 4.7 | 18.562450 | 73.916619 | 18.652450 | 74.006619 | 17:30 | 17:40 | Sandstorms | ... | Metropolitian | 21 | 2022 | 3 | 4 | 17 | 30 | 17 | 40 | PUNE |
| 3 | LUDHRES15DEL02 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | 09:20 | 09:30 | Sandstorms | ... | Metropolitian | 20 | 2022 | 2 | 13 | 9 | 20 | 9 | 30 | LUDH |
| 4 | KNPRES14DEL02 | 24.0 | 4.7 | 26.463504 | 80.372929 | 26.593504 | 80.502929 | 19:50 | 20:05 | Fog | ... | Metropolitian | 41 | 2022 | 2 | 14 | 19 | 50 | 20 | 5 | KNP |
5 rows × 26 columns
In [31]:
# 22 distinct city codes extracted from the courier IDs.
df['Delivery_city'].unique()
Out[31]:
array(['DEH', 'KOC', 'PUNE', 'LUDH', 'KNP', 'MUM', 'MYS', 'HYD', 'KOL',
'RANCHI', 'COIMB', 'CHEN', 'JAP', 'SUR', 'BANG', 'GOA', 'AURG',
'AGR', 'VAD', 'ALH', 'BHP', 'INDO'], dtype=object)
In [32]:
# Column inventory after the time/city feature engineering.
df.columns
Out[32]:
Index(['Delivery_person_ID', 'Delivery_person_Age', 'Delivery_person_Ratings',
'Restaurant_latitude', 'Restaurant_longitude',
'Delivery_location_latitude', 'Delivery_location_longitude',
'Time_Orderd', 'Time_Order_picked', 'Weather_conditions',
'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
'Time_taken (min)', 'year', 'month', 'day', 'TimeOrder_Hour',
'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min',
'Delivery_city'],
dtype='object')
In [33]:
# Ratings still missing after the Time_Orderd dropna (268 rows).
df['Delivery_person_Ratings'].isnull().sum()
Out[33]:
268
Filling null ratings with the mean rating of that delivery person ID¶
In [34]:
# Impute each courier's missing ratings with that courier's own mean
# rating, rounded to one decimal place.
mean_rating_by_courier = df.groupby('Delivery_person_ID')['Delivery_person_Ratings'].mean().round(1)
df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].fillna(
    df['Delivery_person_ID'].map(mean_rating_by_courier)
)
# Confirm no ratings remain missing.
df['Delivery_person_Ratings'].isnull().sum()
Out[34]:
0
Working with maps¶
In [35]:
# Blank CartoDB Positron base map — sanity check that folium renders.
folium.Map(tiles="cartodb positron")
Out[35]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [36]:
import plotly.express as px
# Work on a copy so the cleaning below does not mutate df.
data1 = df.copy()
# Drop rows lacking restaurant coordinates before plotting.
data1.dropna(axis=0, how='any', subset=['Restaurant_latitude', 'Restaurant_longitude'], inplace=True)
# NOTE(review): without a `color=` argument this scale has no visible
# effect on the plot — confirm whether a colour column was intended.
color_scale = [(0, 'orange'), (1, 'red')]
# Scatter every restaurant location on an interactive map.
fig = px.scatter_mapbox(
    data1,
    lat="Restaurant_latitude",
    lon="Restaurant_longitude",
    color_continuous_scale=color_scale,
    zoom=8,
    height=1000,
    width=1000
)
# Free OpenStreetMap tiles (no Mapbox token required).
fig.update_layout(mapbox_style="open-street-map")
# Remove the margins so the map fills the figure.
fig.update_layout(margin={"r":0, "t":0, "l":0, "b":0})
# Render the map.
fig.show()
In [37]:
# latitude of India spans 8°4' N to 37°6' N
# longitude of India spans 68°7' E to 97°25' E
# This dataset relates to India, but some latitudes are given as negative;
# a negative latitude here would fall in the Indian Ocean,
# so the negative signs are most likely data-entry mistakes.
In [38]:
# Fix coordinate sign errors. India lies at roughly 8-37°N, 68-97°E, so
# negative values are data-entry mistakes; abs() removes the sign.
df['Restaurant_latitude'] = df['Restaurant_latitude'].abs()
# The longitude column shows the same defect (min was -88.37 in describe()),
# so mirror the fix there — otherwise the Haversine distances below are
# computed across hemispheres for those rows.
df['Restaurant_longitude'] = df['Restaurant_longitude'].abs()
To calculate the distance between two points on a sphere (such as the Earth), we can use the Haversine formula¶
In [39]:
import math

def distance(lat1, lon1, lat2, lon2):
    """Great-circle (Haversine) distance in kilometres between two points.

    Parameters are decimal-degree latitudes/longitudes; returns a float km.
    """
    EARTH_RADIUS_KM = 6371.0
    phi1, lam1, phi2, lam2 = (math.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0
    # Haversine of the central angle between the two points.
    chord = math.sin(half_dphi) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlam) ** 2
    angle = 2.0 * math.atan2(math.sqrt(chord), math.sqrt(1.0 - chord))
    return EARTH_RADIUS_KM * angle
# Row-wise Haversine distance (km) between restaurant and delivery point.
df['distance'] = df.apply(lambda row: distance(row['Restaurant_latitude'], row['Restaurant_longitude'], row['Delivery_location_latitude'], row['Delivery_location_longitude']), axis=1)
In [40]:
# First few computed distances (km).
df['distance'].head()
Out[40]:
0 10.280582 1 6.242319 2 13.787860 3 2.930258 4 19.396618 Name: distance, dtype: float64
In [41]:
# Sanity check: for a fixed restaurant, the distance grows monotonically as
# the delivery point moves further away.
df.groupby(['Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude'])['distance'].max().head(40)
Out[41]:
Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude
0.000000 0.000000 0.010000 0.010000 1.572534
0.020000 0.020000 3.145067
0.030000 0.030000 4.717601
0.040000 0.040000 6.290135
0.050000 0.050000 7.862668
0.060000 0.060000 9.435202
0.070000 0.070000 11.007735
0.080000 0.080000 12.580268
0.090000 0.090000 14.152801
0.110000 0.110000 17.297866
0.130000 0.130000 20.442930
9.957144 76.296783 9.967144 76.306783 1.560724
9.977144 76.316783 3.121424
9.987144 76.326783 4.682100
9.997144 76.336783 6.242753
10.007144 76.346783 7.803382
10.017144 76.356783 9.363988
10.027144 76.366783 10.924570
10.037144 76.376783 12.485128
10.047144 76.386783 14.045662
10.067144 76.406783 17.166660
10.087144 76.426783 20.287562
9.959778 76.296106 9.969778 76.306106 1.560717
9.979778 76.316106 3.121411
9.989778 76.326106 4.682082
9.999778 76.336106 6.242728
10.009778 76.346106 7.803351
10.019778 76.356106 9.363950
10.029778 76.366106 10.924526
10.039778 76.376106 12.485078
10.049778 76.386106 14.045606
10.069778 76.406106 17.166591
10.089778 76.426106 20.287481
9.960846 76.293936 9.970846 76.303936 1.560715
9.980846 76.313936 3.121406
9.990846 76.323936 4.682074
10.000846 76.333936 6.242718
10.010846 76.343936 7.803339
10.020846 76.353936 9.363935
10.030846 76.363936 10.924508
Name: distance, dtype: float64
In [42]:
# Dropping coordinates that fall outside India's bounding box:
# latitude of India spans 8°4' N to 37°6' N
# longitude of India spans 68°7' E to 97°25' E
In [43]:
# 3,509 rows have BOTH latitudes below 8°N (south of mainland India).
df[(df['Restaurant_latitude'] < 8) & (df['Delivery_location_latitude'] < 8)].shape
Out[43]:
(3509, 27)
In [44]:
# Remove rows where both restaurant and delivery latitudes fall below 8°N
# (outside India's latitude band).
below_india = (df['Restaurant_latitude'] < 8) & (df['Delivery_location_latitude'] < 8)
df = df[~below_india]
In [45]:
# No rows lie north of India's latitude band (>37°N) — nothing to drop.
df[(df['Restaurant_latitude'] > 37) & (df['Delivery_location_latitude'] > 37)]
Out[45]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Time_Orderd | Time_Order_picked | Weather_conditions | ... | Time_taken (min) | year | month | day | TimeOrder_Hour | TimeOrder_min | Time_Order_picked_Hour | Time_Order_picked_min | Delivery_city | distance |
|---|
0 rows × 27 columns
In [46]:
# No remaining rows lie west of India's longitude band (<68°E).
df[(df['Restaurant_longitude'] < 68) & (df['Delivery_location_longitude'] < 68)]
Out[46]:
| Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Time_Orderd | Time_Order_picked | Weather_conditions | ... | Time_taken (min) | year | month | day | TimeOrder_Hour | TimeOrder_min | Time_Order_picked_Hour | Time_Order_picked_min | Delivery_city | distance |
|---|
0 rows × 27 columns
In [47]:
# Column inventory after adding the distance feature.
df.columns
Out[47]:
Index(['Delivery_person_ID', 'Delivery_person_Age', 'Delivery_person_Ratings',
'Restaurant_latitude', 'Restaurant_longitude',
'Delivery_location_latitude', 'Delivery_location_longitude',
'Time_Orderd', 'Time_Order_picked', 'Weather_conditions',
'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
'Time_taken (min)', 'year', 'month', 'day', 'TimeOrder_Hour',
'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min',
'Delivery_city', 'distance'],
dtype='object')
In [48]:
coord_cols = ['Restaurant_latitude', 'Restaurant_longitude',
              'Delivery_location_latitude', 'Delivery_location_longitude']
# Coordinates of the single shortest recorded trip (distance is the group key).
min_distance = df.groupby('distance')[coord_cols].min().sort_values(by='distance', ascending=True).head(1)
min_distance
Out[48]:
| Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | |
|---|---|---|---|---|
| distance | ||||
| 1.465067 | 30.914057 | 75.83982 | 30.924057 | 75.84982 |
In [49]:
import folium
from branca.element import Figure
# Fixed-size wrapper so the embedded map renders at 850x550.
fig2=Figure(width=850,height=550)
# Map centred on the shortest trip's restaurant (coords from min_distance above).
m = folium.Map(location=[30.914057, 75.83982], zoom_start=12)
# Restaurant marker (purple Font Awesome anchor icon).
folium.Marker(location=[30.914057, 75.83982], popup="Restaurant",icon=folium.Icon(color='purple',prefix='fa',icon='anchor')).add_to(m)
# Delivery-destination marker (default icon).
folium.Marker(location=[30.924057, 75.84982], popup="Delivery").add_to(m)
# Straight line joining restaurant and delivery point.
folium.PolyLine(locations=[[30.914057, 75.83982], [30.924057, 75.84982]], color='red').add_to(m)
# Attach the map to the sized figure.
fig2.add_child(m)
# Optional base-map layers, switchable via the LayerControl widget.
# NOTE(review): the free public Stamen tile endpoints were retired in 2023,
# so these three layers may render blank with current folium — confirm.
folium.TileLayer('Stamen Terrain').add_to(m)
folium.TileLayer('Stamen Toner').add_to(m)
folium.TileLayer('Stamen Water Color').add_to(m)
folium.TileLayer('cartodbpositron').add_to(m)
folium.TileLayer('cartodbdark_matter').add_to(m)
folium.LayerControl().add_to(m)
# Last expression -> the map is displayed inline.
m
Out[49]:
In [50]:
coord_cols = ['Restaurant_latitude', 'Restaurant_longitude',
              'Delivery_location_latitude', 'Delivery_location_longitude']
# Coordinates of the single longest recorded trip.
max_distance = df.groupby('distance')[coord_cols].max().sort_values(by='distance', ascending=False).head(1)
max_distance
Out[50]:
| Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | |
|---|---|---|---|---|
| distance | ||||
| 20.969489 | 25.443994 | 81.860187 | 25.583994 | 82.000187 |
In [51]:
# Tooltip for the route line below; df['distance'].max() is the distance of
# the max_distance pair plotted in the next cell.
tooltips=f"Distance: {df['distance'].max()} km"
In [52]:
import folium
from branca.element import Figure
# NOTE(review): near-duplicate of the In[49] map cell — consider extracting
# a shared helper that takes the two coordinates and a tooltip.
fig2=Figure(width=850,height=550)
# Map centred on the longest trip's restaurant (coords from max_distance above).
m = folium.Map(location=[25.443994, 81.860187], zoom_start=12)
# Restaurant marker (purple Font Awesome anchor icon).
folium.Marker(location=[25.443994, 81.860187], popup="Restaurant",icon=folium.Icon(color='purple',prefix='fa',icon='anchor')).add_to(m)
# Delivery-destination marker (default icon).
folium.Marker(location=[25.583994, 82.000187], popup="Delivery").add_to(m)
# Route line carrying the distance tooltip built in the previous cell.
folium.PolyLine(locations=[[25.443994, 81.860187], [25.583994,82.000187]], color='red',tooltip=tooltips).add_to(m)
# Attach the map to the sized figure.
fig2.add_child(m)
# Optional base-map layers, switchable via the LayerControl widget.
# NOTE(review): the free public Stamen tile endpoints were retired in 2023,
# so these three layers may render blank with current folium — confirm.
folium.TileLayer('Stamen Terrain').add_to(m)
folium.TileLayer('Stamen Toner').add_to(m)
folium.TileLayer('Stamen Water Color').add_to(m)
folium.TileLayer('cartodbpositron').add_to(m)
folium.TileLayer('cartodbdark_matter').add_to(m)
folium.LayerControl().add_to(m)
# Last expression -> the map is displayed inline.
m
Out[52]:
In [53]:
import plotly.express as px

# Work on a copy so the main DataFrame is untouched.
data1 = df.copy()

# scatter_mapbox cannot place rows with missing coordinates.
data1.dropna(axis=0, how='any',
             subset=['Restaurant_latitude', 'Restaurant_longitude'],
             inplace=True)

# NOTE(review): the original cell built a color_scale and passed
# color_continuous_scale without any `color=` column, so the scale was
# silently ignored by plotly; the dead code has been removed.
fig = px.scatter_mapbox(
    data1,
    lat="Restaurant_latitude",
    lon="Restaurant_longitude",
    zoom=8,
    height=1000,
    width=1000,
)

# Free OpenStreetMap tiles — no Mapbox access token required.
fig.update_layout(mapbox_style="open-street-map")
# Remove the default margins so the map fills the figure.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
In [54]:
# matplotlib and seaborn are already imported in the first cell — the
# duplicate imports that used to live here were removed.
plt.figure(figsize=(12, 8))
# Order volume per delivery city code.
sns.countplot(x='Delivery_city', data=df)
# Vertical tick labels keep the many city codes readable.
plt.xticks(rotation='vertical')
plt.show()
In [55]:
# How often does each weather condition occur across all orders?
fig_weather, ax_weather = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Weather_conditions', ax=ax_weather)
# Rotate the category labels so they do not overlap.
ax_weather.tick_params(axis='x', labelrotation=90)
plt.show()
In [56]:
# Order counts per road-traffic density level.
fig_traffic, ax_traffic = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Road_traffic_density', ax=ax_traffic)
ax_traffic.tick_params(axis='x', labelrotation=90)
plt.show()
In [57]:
# Number of orders placed on each day of the month.
fig_day, ax_day = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='day', ax=ax_day)
ax_day.tick_params(axis='x', labelrotation=90)
plt.show()
In [58]:
# Distribution of vehicle-condition ratings.
fig_veh, ax_veh = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Vehicle_condition', ax=ax_veh)
ax_veh.tick_params(axis='x', labelrotation=90)
plt.show()
In [59]:
# Columns with non-object dtypes (for reference; recomputed as `num_col`
# after the drops below). NOTE(review): the original cell had a bare
# `numerical_feature` expression on a non-final line — it produced no
# output and has been removed.
numerical_feature = [feature for feature in df.columns if df[feature].dtypes != 'O']

# Drop the identifier and raw time strings in one call: the ID carries no
# predictive signal, and the order/pickup times were already decomposed
# into hour/minute columns earlier in the notebook.
df.drop(['Delivery_person_ID', 'Time_Orderd', 'Time_Order_picked'],
        axis=1, inplace=True)
Splitting into features (X) and target (y)
In [60]:
# Features: every column except the target.
X = df.drop(columns=['Time_taken (min)'])
# Target kept as a single-column DataFrame (note the double brackets).
y = df[['Time_taken (min)']]
In [61]:
# Numeric (non-object) columns; the target is still included here.
num_col = df.columns[df.dtypes != 'O'].tolist()
num_col
Out[61]:
['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude', 'Vehicle_condition', 'multiple_deliveries', 'Time_taken (min)', 'year', 'month', 'day', 'TimeOrder_Hour', 'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min', 'distance']
In [62]:
# Object-dtype (categorical/string) columns.
cat_col = df.columns[df.dtypes == 'O'].tolist()
cat_col
Out[62]:
['Weather_conditions', 'Road_traffic_density', 'Type_of_order', 'Type_of_vehicle', 'Festival', 'City', 'Delivery_city']
In [63]:
df[cat_col].head()
Out[63]:
| Weather_conditions | Road_traffic_density | Type_of_order | Type_of_vehicle | Festival | City | Delivery_city | |
|---|---|---|---|---|---|---|---|
| 0 | Fog | Jam | Snack | motorcycle | No | Metropolitian | DEH |
| 1 | Stormy | High | Meal | motorcycle | No | Metropolitian | KOC |
| 2 | Sandstorms | Medium | Drinks | scooter | No | Metropolitian | PUNE |
| 3 | Sandstorms | Low | Buffet | motorcycle | No | Metropolitian | LUDH |
| 4 | Fog | Jam | Snack | scooter | No | Metropolitian | KNP |
In [64]:
df[df['Road_traffic_density']=='Low']
Out[64]:
| Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Weather_conditions | Road_traffic_density | Vehicle_condition | Type_of_order | ... | Time_taken (min) | year | month | day | TimeOrder_Hour | TimeOrder_min | Time_Order_picked_Hour | Time_Order_picked_min | Delivery_city | distance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 34.0 | 4.3 | 30.899584 | 75.809346 | 30.919584 | 75.829346 | Sandstorms | Low | 0 | Buffet | ... | 20 | 2022 | 2 | 13 | 9 | 20 | 9 | 30 | LUDH | 2.930258 |
| 11 | 31.0 | 4.7 | 23.357804 | 85.325146 | 23.487804 | 85.455146 | Sandstorms | Low | 2 | Meal | ... | 26 | 2022 | 3 | 10 | 22 | 30 | 22 | 45 | RANCHI | 19.618739 |
| 12 | 37.0 | 5.0 | 11.003669 | 76.976494 | 11.013669 | 76.986494 | Sandstorms | Low | 1 | Snack | ... | 20 | 2022 | 3 | 11 | 8 | 15 | 8 | 30 | COIMB | 1.558132 |
| 17 | 35.0 | 4.3 | 21.160522 | 72.771477 | 21.250522 | 72.861477 | Cloudy | Low | 1 | Drinks | ... | 38 | 2022 | 3 | 12 | 23 | 20 | 23 | 30 | SUR | 13.682045 |
| 19 | 23.0 | 4.8 | 18.514210 | 73.838429 | 18.624210 | 73.948429 | Windy | Low | 2 | Buffet | ... | 12 | 2022 | 4 | 2 | 23 | 35 | 23 | 45 | PUNE | 16.853619 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45569 | 30.0 | 4.9 | 11.025083 | 77.015393 | 11.045083 | 77.035393 | Windy | Low | 1 | Snack | ... | 26 | 2022 | 3 | 11 | 10 | 25 | 10 | 40 | COIMB | 3.116127 |
| 45571 | 34.0 | 4.9 | 12.297954 | 76.665169 | 12.317954 | 76.685169 | Sandstorms | Low | 1 | Snack | ... | 26 | 2022 | 4 | 5 | 9 | 10 | 9 | 15 | MYS | 3.109134 |
| 45573 | 30.0 | 4.6 | 18.562450 | 73.916619 | 18.582450 | 73.936619 | Cloudy | Low | 0 | Meal | ... | 20 | 2022 | 4 | 1 | 0 | 37 | 9 | 15 | PUNE | 3.064267 |
| 45575 | 37.0 | 4.8 | 13.022394 | 80.242439 | 13.042394 | 80.262439 | Sandstorms | Low | 2 | Drinks | ... | 20 | 2022 | 4 | 5 | 9 | 35 | 9 | 50 | CHEN | 3.104827 |
| 45581 | 30.0 | 4.9 | 13.022394 | 80.242439 | 13.052394 | 80.272439 | Cloudy | Low | 1 | Drinks | ... | 16 | 2022 | 3 | 11 | 23 | 50 | 24 | 5 | CHEN | 4.657195 |
13815 rows × 24 columns
In [65]:
# Explicit category orderings for the OrdinalEncoder in the pipeline
# cell below: lower index = lighter traffic / (presumably) milder
# weather — TODO confirm the intended weather severity ordering.
Road_traffic_density=['Low','Medium','High','Jam']
Weather_conditions=['Sunny','Cloudy','Windy','Fog','Sandstorms','Stormy']
In [66]:
# Column groups routed to the three preprocessing pipelines below:
# one-hot-encoded nominal columns ...
categorical_column=['Type_of_order','Type_of_vehicle','Festival','City','Delivery_city']
# ... ordinal columns with the explicit orderings defined above ...
ordinal_encod=['Road_traffic_density','Weather_conditions']
# ... and numeric columns to impute + scale. Columns not listed here are
# dropped by the ColumnTransformer's default remainder behaviour.
numerical_column=['Delivery_person_Age','Delivery_person_Ratings','Vehicle_condition','multiple_deliveries',
'TimeOrder_Hour','distance']
In [67]:
# Numerical pipeline: missing values are replaced with the constant 0
# (NOTE(review): mean/median imputation may be more appropriate for
# ratings and age — confirm 0 is intentional), then scaled without
# centering so the output stays compatible with the sparse one-hot block.
numerical_pipeline=Pipeline(steps=[
('impute',SimpleImputer(strategy='constant',fill_value=0)),
('scaler',StandardScaler(with_mean=False))
])
# Categorical pipeline: mode-impute, one-hot encode (unknown categories
# at transform time become all-zero rows), then scale. with_mean=False is
# required because OneHotEncoder emits a sparse matrix.
categorical_pipeline=Pipeline(steps=[
('impute',SimpleImputer(strategy='most_frequent')),
('onehot',OneHotEncoder(handle_unknown='ignore')),
('scaler',StandardScaler(with_mean=False))
])
# Ordinal pipeline: mode-impute, then map categories to integers using the
# explicit orderings defined two cells above. NOTE(review): "ordianl" is a
# typo, but the name is referenced by the ColumnTransformer cell below, so
# it is kept as-is.
ordianl_pipeline=Pipeline(steps=[
('impute',SimpleImputer(strategy='most_frequent')),
('ordinal',OrdinalEncoder(categories=[Road_traffic_density,Weather_conditions])),
('scaler',StandardScaler(with_mean=False))
])
In [68]:
# Route each column group to its pipeline. Columns not listed in any group
# (e.g. the raw latitude/longitude and the remaining date parts) are
# dropped, since ColumnTransformer defaults to remainder='drop'.
preprocessor =ColumnTransformer([
('numerical_pipeline',numerical_pipeline,numerical_column),
('categorical_pipeline',categorical_pipeline,categorical_column),
('ordianl_pipeline',ordianl_pipeline,ordinal_encod)
])
In [69]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
In [70]:
# Fit the preprocessor on the training data only, then apply the same
# already-fitted transformations to the test set — fitting on the full
# data would leak test-set statistics into training.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)
In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
In [72]:
def evaluate_reg(true, predicted):
    """Compute standard regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, same length as ``true``.

    Returns
    -------
    tuple
        ``(r2, MAE, MSE, rmse)``.
    """
    r2 = r2_score(true, predicted)
    MAE = mean_absolute_error(true, predicted)
    MSE = mean_squared_error(true, predicted)
    # Reuse the already-computed MSE instead of calling
    # mean_squared_error a second time (as the original did).
    rmse = np.sqrt(MSE)
    return r2, MAE, MSE, rmse
In [73]:
# All estimators and metrics used here were already imported in the cell
# above — the duplicated import block that used to live here was removed.
models = {
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "XGBRegressor": XGBRegressor(),
}

r2_list = []
mse_list = []
models_list = []

# Fit each candidate model and report train/test metrics side by side so
# over-fitting (a large train/test gap, e.g. the Decision Tree) is easy
# to spot. Iterating dict items directly replaces the original
# index-based `for i in range(len(list(models)))` loop; insertion order
# is preserved, so the result lists line up exactly as before.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    r2, MAE, MSE, rmse = evaluate_reg(y_test, y_test_pred)
    r2_train, MAE_train, MSE_train, rmse_train = evaluate_reg(y_train, y_train_pred)

    print(name)
    models_list.append(name)

    print('Model performance for Test set')
    print('- r2_score: {:.4f}'.format(r2))
    print('- MAE: {:.4f}'.format(MAE))
    print('- MSE: {:.4f}'.format(MSE))
    print('- rmse: {:.4f}'.format(rmse))
    r2_list.append(r2)
    mse_list.append(MSE)

    print('-'*35)
    print('Model performance for train set')
    print('- r2_score: {:.4f}'.format(r2_train))
    print('- MAE: {:.4f}'.format(MAE_train))
    print('- MSE: {:.4f}'.format(MSE_train))
    print('- rmse: {:.4f}'.format(rmse_train))
    print('='*35)
    print('='*35)
    print('\n')
Random Forest Model performance for Test set - r2_score: 0.8283 - MAE: 3.1358 - MSE: 15.3218 - rmse: 3.9143 ----------------------------------- Model performance for train set - r2_score: 0.9759 - MAE: 1.1483 - MSE: 2.1068 - rmse: 1.4515 =================================== =================================== Decision Tree Model performance for Test set - r2_score: 0.6935 - MAE: 3.9932 - MSE: 27.3604 - rmse: 5.2307 ----------------------------------- Model performance for train set - r2_score: 1.0000 - MAE: 0.0000 - MSE: 0.0000 - rmse: 0.0000 =================================== =================================== Gradient Boosting Model performance for Test set - r2_score: 0.7697 - MAE: 3.6335 - MSE: 20.5584 - rmse: 4.5341 ----------------------------------- Model performance for train set - r2_score: 0.7740 - MAE: 3.5680 - MSE: 19.7730 - rmse: 4.4467 =================================== =================================== linear Regression Model performance for Test set - r2_score: 0.5351 - MAE: 5.1319 - MSE: 41.4926 - rmse: 6.4415 ----------------------------------- Model performance for train set - r2_score: 0.5396 - MAE: 5.0371 - MSE: 40.2769 - rmse: 6.3464 =================================== =================================== Ridge Model performance for Test set - r2_score: 0.5352 - MAE: 5.1301 - MSE: 41.4830 - rmse: 6.4407 ----------------------------------- Model performance for train set - r2_score: 0.5396 - MAE: 5.0352 - MSE: 40.2714 - rmse: 6.3460 =================================== =================================== XGBRegressor Model performance for Test set - r2_score: 0.8288 - MAE: 3.1347 - MSE: 15.2790 - rmse: 3.9088 ----------------------------------- Model performance for train set - r2_score: 0.8752 - MAE: 2.6442 - MSE: 10.9201 - rmse: 3.3046 =================================== ===================================
In [74]:
Results = pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model Name', 'r2_score'])
Results.sort_values(by=["r2_score"],ascending=False)
Out[74]:
| Model Name | r2_score | |
|---|---|---|
| 5 | XGBRegressor | 0.828819 |
| 0 | Random Forest | 0.828340 |
| 2 | Gradient Boosting | 0.769671 |
| 1 | Decision Tree | 0.693463 |
| 4 | Ridge | 0.535238 |
| 3 | linear Regression | 0.535130 |
In [76]:
Results = pd.DataFrame(list(zip(models_list, mse_list)), columns=['Model Name', 'Mse'])
Results.sort_values(by=["Mse"],ascending=True)
Out[76]:
| Model Name | Mse | |
|---|---|---|
| 5 | XGBRegressor | 15.278972 |
| 0 | Random Forest | 15.321797 |
| 2 | Gradient Boosting | 20.558401 |
| 1 | Decision Tree | 27.360392 |
| 4 | Ridge | 41.482991 |
| 3 | linear Regression | 41.492630 |